In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import statsmodels.formula.api as smf
from statsmodels.compat import lzip
import statsmodels.stats.api as sms
import statsmodels.api as sm
In [2]:
mf = pd.read_csv('../data/mapping_cleaned_MrOS.txt', sep='\t', dtype=str, index_col='#SampleID', na_values='Missing:Not collected')
In [3]:
genus = pd.read_csv('../data/table_mc5876_sorted_L6.txt', skiprows=1, sep='\t', dtype=str, index_col = '#OTU ID')
In [4]:
genus.head()
Out[4]:
In [5]:
# select interested genus (no E.coli found)
genus = genus[genus.index.str.contains(
'Bifidobacterium|Faecalibacterium|Eubacterium|Roseburia \
|Anaerostipes|Bacteroides|Ruminococcus|Lactobacillus|Escherichia \
|Blautia|Ruminococcus|Lachnospira|Collinsella \
|Coprococcus|Phascolarctobacterium|Prevotella',
na=False)]
In [6]:
genus.index
Out[6]:
In [7]:
genus = genus.transpose()
In [8]:
genus.head()
Out[8]:
In [9]:
table = pd.merge(mf, genus, left_index=True, right_index=True)
In [10]:
print(genus.shape)
print(mf.shape)
print(table.shape)
In [11]:
table.head()
Out[11]:
In [12]:
vars_vd = np.array(['OHVD3', 'OHV1D3', 'OHV24D3',
'ratio_activation', 'ratio_catabolism'])
table = table[np.append(list(genus.columns), vars_vd)]
print(table.shape)
In [13]:
table.head()
Out[13]:
In [14]:
table = table.apply(pd.to_numeric, errors='coerce')
In [15]:
table.drop(['k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;Other',
'k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__',
'k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Blautia',
'k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Butyrivibrio',
'k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Dorea',
'k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Epulopiscium',
'k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Lachnobacterium',
'k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Moryella',
'k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Oribacterium'],
axis=1, inplace=True)
In [16]:
table = table.rename(columns = {
'k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Bifidobacteriales;f__Bifidobacteriaceae;g__Bifidobacterium': 'Bifidobacterium',
'k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides': 'Bacteroides',
'k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Prevotellaceae;g__Prevotella': 'Prevotellaceae_Prevotella',
'k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__[Paraprevotellaceae];g__[Prevotella]': 'Paraprevotellaceae_Prevotella',
'k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Lactobacillaceae;g__Lactobacillus': 'Lactobacillus',
'k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Eubacteriaceae;g__Pseudoramibacter_Eubacterium': 'Pseudoramibacter_Eubacterium',
'k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Anaerostipes': 'Anaerostipes',
'k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Coprococcus': 'Coprococcus',
'k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Lachnospira': 'Lachnospira',
'k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Roseburia': 'Roseburia',
'k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__[Ruminococcus]': 'Lachnospiraceae_Ruminococcus',
'k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Faecalibacterium': 'Faecalibacterium',
'k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Ruminococcus': 'Ruminococcaceae_Ruminococcus',
'k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Veillonellaceae;g__Phascolarctobacterium': 'Phascolarctobacterium',
'k__Bacteria;p__Firmicutes;c__Erysipelotrichi;o__Erysipelotrichales;f__Erysipelotrichaceae;g__[Eubacterium]': 'Eubacterium'})
In [17]:
table.head()
Out[17]:
In [18]:
table.columns
Out[18]:
In [19]:
#table.to_csv('../data/genus.txt', sep='\t')
In [21]:
var = table.columns[0:15]
print(var)
In [22]:
vars_vd
Out[22]:
In [23]:
df = table.copy()
out = []
k = 0
for i in range(len(var)):
tmp = df[[var[i], vars_vd[k]]].dropna(axis=0, how='any')
y = tmp[vars_vd[k]]
X = tmp[var[i]]
results = smf.OLS(y, sm.add_constant(X)).fit()
# normality test
name = ['Chi^2', 'Two-tail probability']
test = sms.omni_normtest(results.resid)
normtest = lzip(name, test)[1][1]
# condition number
cn = np.linalg.cond(results.model.exog)
# heteroskedasticity tests (null: the residual variance does not depend on the variables in x)
name = ['Lagrange multiplier statistic', 'p-value']
test = sms.het_breuschpagan(results.resid, results.model.exog)
heter = lzip(name, test)[1][1]
# linearity test (null: is linear)
# name = ['t value', 'p value']
# test = sms.linear_harvey_collier(results)
# linear = lzip(name, test)[1][1]
out.append([vars_vd[k], var[i], results.params[1], results.pvalues[1],
results.rsquared_adj, normtest, cn, heter])
out = pd.DataFrame(out, columns=['y', 'X', 'slope', 'pvalue', 'adjusted R-square',
'norm test P-val', 'condition number', 'hetero test P-val'])
out_OHVD3 = out.loc[out.pvalue <= 0.05]
out_OHVD3
Out[23]:
In [27]:
table.head()
Out[27]:
In [23]:
k = 0
bact = out_OHVD3.X.values
sns.set(color_codes=True)
sns.set_palette("husl", n_colors=len(bact))
for i in range(len(bact)):
ax = sns.regplot(x=vars_vd[k], y=bact[i],
label=bact[i], data=table)
ax.set(xlabel='25(OH)D3 (ng/ml)', ylabel='Relative abundance')
ax.legend()
#plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
ax = ax.get_figure()
ax.tight_layout()
ax.savefig('../figures/genus_reg_VD3.pdf')
ax.savefig('../figures/genus_reg_VD3.png')
In [24]:
out = []
k = 1
for i in range(len(var)):
tmp = df[[var[i], vars_vd[k]]].dropna(axis=0, how='any')
y = tmp[vars_vd[k]]
X = tmp[var[i]]
results = smf.OLS(y, sm.add_constant(X)).fit()
# normality test
name = ['Chi^2', 'Two-tail probability']
test = sms.omni_normtest(results.resid)
normtest = lzip(name, test)[1][1]
# condition number
cn = np.linalg.cond(results.model.exog)
# heteroskedasticity tests (null: the residual variance does not depend on the variables in x)
name = ['Lagrange multiplier statistic', 'p-value']
test = sms.het_breuschpagan(results.resid, results.model.exog)
heter = lzip(name, test)[1][1]
# linearity test (null: is linear)
# name = ['t value', 'p value']
# test = sms.linear_harvey_collier(results)
# linear = lzip(name, test)[1][1]
out.append([vars_vd[k], var[i], results.params[1], results.pvalues[1],
results.rsquared_adj, normtest, cn, heter])
out = pd.DataFrame(out, columns=['y', 'X', 'slope', 'pvalue', 'adjusted R-square',
'norm test P-val', 'condition number', 'hetero test P-val'])
out_OHV1D3 = out.loc[out.pvalue <= 0.05]
out_OHV1D3
Out[24]:
In [25]:
out
Out[25]:
In [26]:
k = 1
bact = out_OHV1D3.X.values
sns.set(color_codes=True)
sns.set_palette("husl", n_colors=len(bact))
for i in range(len(bact)):
ax = sns.regplot(x=vars_vd[k], y=bact[i],
label=bact[i], data=table)
ax.set(xlabel='1,25(OH)2D3 (ng/ml)', ylabel='Relative abundance')
ax.legend()
#plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
ax = ax.get_figure()
ax.tight_layout()
ax.savefig('../figures/genus_reg_V1D3.pdf')
ax.savefig('../figures/genus_reg_V1D3.png')
In [27]:
out = []
k = 2
for i in range(len(var)):
tmp = df[[var[i], vars_vd[k]]].dropna(axis=0, how='any')
y = tmp[vars_vd[k]]
X = tmp[var[i]]
results = smf.OLS(y, sm.add_constant(X)).fit()
# normality test
name = ['Chi^2', 'Two-tail probability']
test = sms.omni_normtest(results.resid)
normtest = lzip(name, test)[1][1]
# condition number
cn = np.linalg.cond(results.model.exog)
# heteroskedasticity tests (null: the residual variance does not depend on the variables in x)
name = ['Lagrange multiplier statistic', 'p-value']
test = sms.het_breuschpagan(results.resid, results.model.exog)
heter = lzip(name, test)[1][1]
# linearity test (null: is linear)
# name = ['t value', 'p value']
# test = sms.linear_harvey_collier(results)
# linear = lzip(name, test)[1][1]
out.append([vars_vd[k], var[i], results.params[1], results.pvalues[1],
results.rsquared_adj, normtest, cn, heter])
out = pd.DataFrame(out, columns=['y', 'X', 'slope', 'pvalue', 'adjusted R-square',
'norm test P-val', 'condition number', 'hetero test P-val'])
out_OHV24D3 = out.loc[out.pvalue <= 0.05]
out_OHV24D3
Out[27]:
In [28]:
k = 2
bact = out_OHV24D3.X.values
sns.set(color_codes=True)
sns.set_palette("husl", n_colors=len(bact))
for i in range(len(bact)):
ax = sns.regplot(x=vars_vd[k], y=bact[i],
label=bact[i], data=table)
ax.set(xlabel='24,25(OH)2D3 (ng/ml)', ylabel='Relative abundance')
ax.legend()
#plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
ax = ax.get_figure()
ax.tight_layout()
ax.savefig('../figures/genus_reg_24D3.pdf')
ax.savefig('../figures/genus_reg_24D3.png')
In [29]:
out = []
k = 3
for i in range(len(var)):
tmp = df[[var[i], vars_vd[k]]].dropna(axis=0, how='any')
y = tmp[vars_vd[k]]
X = tmp[var[i]]
results = smf.OLS(y, sm.add_constant(X)).fit()
# normality test
name = ['Chi^2', 'Two-tail probability']
test = sms.omni_normtest(results.resid)
normtest = lzip(name, test)[1][1]
# condition number
cn = np.linalg.cond(results.model.exog)
# heteroskedasticity tests (null: the residual variance does not depend on the variables in x)
name = ['Lagrange multiplier statistic', 'p-value']
test = sms.het_breuschpagan(results.resid, results.model.exog)
heter = lzip(name, test)[1][1]
# linearity test (null: is linear)
# name = ['t value', 'p value']
# test = sms.linear_harvey_collier(results)
# linear = lzip(name, test)[1][1]
out.append([vars_vd[k], var[i], results.params[1], results.pvalues[1],
results.rsquared_adj, normtest, cn, heter])
out = pd.DataFrame(out, columns=['y', 'X', 'slope', 'pvalue', 'adjusted R-square',
'norm test P-val', 'condition number', 'hetero test P-val'])
out_activation = out.loc[out.pvalue <= 0.05]
out_activation
Out[29]:
In [30]:
k = 3
bact = out_activation.X.values
sns.set(color_codes=True)
sns.set_palette("husl", n_colors=len(bact))
for i in range(len(bact)):
ax = sns.regplot(x=vars_vd[k], y=bact[i],
label=bact[i], data=table)
ax.set(xlabel=vars_vd[k], ylabel='Relative abundance')
ax.legend()
#plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
ax = ax.get_figure()
ax.tight_layout()
ax.savefig('../figures/genus_reg_act.pdf')
ax.savefig('../figures/genus_reg_act.png')
In [31]:
out = []
k = 4
for i in range(len(var)):
tmp = df[[var[i], vars_vd[k]]].dropna(axis=0, how='any')
y = tmp[vars_vd[k]]
X = tmp[var[i]]
results = smf.OLS(y, sm.add_constant(X)).fit()
# normality test
name = ['Chi^2', 'Two-tail probability']
test = sms.omni_normtest(results.resid)
normtest = lzip(name, test)[1][1]
# condition number
cn = np.linalg.cond(results.model.exog)
# heteroskedasticity tests (null: the residual variance does not depend on the variables in x)
name = ['Lagrange multiplier statistic', 'p-value']
test = sms.het_breuschpagan(results.resid, results.model.exog)
heter = lzip(name, test)[1][1]
# linearity test (null: is linear)
# name = ['t value', 'p value']
# test = sms.linear_harvey_collier(results)
# linear = lzip(name, test)[1][1]
out.append([vars_vd[k], var[i], results.params[1], results.pvalues[1],
results.rsquared_adj, normtest, cn, heter])
out = pd.DataFrame(out, columns=['y', 'X', 'slope', 'pvalue', 'adjusted R-square',
'norm test P-val', 'condition number', 'hetero test P-val'])
out_cabolism = out.loc[out.pvalue <= 0.05]
out_cabolism
Out[31]:
In [32]:
k = 4
bact = out_cabolism.X.values
sns.set(color_codes=True)
sns.set_palette("husl", n_colors=len(bact))
for i in range(len(bact)):
ax = sns.regplot(x=vars_vd[k], y=bact[i],
label=bact[i], data=table)
ax.set(xlabel=vars_vd[k], ylabel='Relative abundance')
ax.legend()
#plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
ax = ax.get_figure()
ax.tight_layout()
ax.savefig('../figures/genus_reg_cat.pdf')
ax.savefig('../figures/genus_reg_cat.png')
In [33]:
k = 4
bact = out_cabolism.X.values
sns.set(color_codes=True)
sns.set_palette("husl", n_colors=len(bact))
for i in range(len(bact)):
ax = sns.regplot(x=vars_vd[k], y='Lactobacillus',
label=bact[i], data=table)
ax.set(xlabel=vars_vd[k], ylabel='Relative abundance')
ax.legend()
Out[33]:
In [34]:
table.describe()
Out[34]:
In [35]:
# clustermap: http://seaborn.pydata.org/generated/seaborn.clustermap.html